dFtraining <- read.csv("pml-training.csv", stringsAsFactors = F,na.strings = c("","NA","#DIV/0!"))
dFtesting <- read.csv("pml-testing.csv", stringsAsFactors = F,na.strings = c("","NA","#DIV/0!"))
dim(dFtraining); dim(dFtesting)
## [1] 19622   160
## [1]  20 160
set.seed(101)
iNtraining <- createDataPartition(dFtraining$classe, p = 0.8, list = F)
dFvalue <- dFtraining[-iNtraining,]
dFtraining <- dFtraining[iNtraining,]
dim(dFtraining); dim(dFvalue)
## [1] 15699   160
## [1] 3923  160
table(dFtraining$classe)/nrow(dFtraining)
## 
##         A         B         C         D         E 
## 0.2843493 0.1935155 0.1744060 0.1638958 0.1838334
beltmissing <- sapply(select(dFtraining,names(dFtraining)[grepl("_belt",names(dFtraining))]),
                    function(x) sum(is.na(x)))
beltmissing
##            roll_belt           pitch_belt             yaw_belt 
##                    0                    0                    0 
##     total_accel_belt   kurtosis_roll_belt  kurtosis_picth_belt 
##                    0                15396                15413 
##    kurtosis_yaw_belt   skewness_roll_belt skewness_roll_belt.1 
##                15699                15395                15413 
##    skewness_yaw_belt        max_roll_belt       max_picth_belt 
##                15699                15388                15388 
##         max_yaw_belt        min_roll_belt       min_pitch_belt 
##                15396                15388                15388 
##         min_yaw_belt  amplitude_roll_belt amplitude_pitch_belt 
##                15396                15388                15388 
##   amplitude_yaw_belt var_total_accel_belt        avg_roll_belt 
##                15396                15388                15388 
##     stddev_roll_belt        var_roll_belt       avg_pitch_belt 
##                15388                15388                15388 
##    stddev_pitch_belt       var_pitch_belt         avg_yaw_belt 
##                15388                15388                15388 
##      stddev_yaw_belt         var_yaw_belt         gyros_belt_x 
##                15388                15388                    0 
##         gyros_belt_y         gyros_belt_z         accel_belt_x 
##                    0                    0                    0 
##         accel_belt_y         accel_belt_z        magnet_belt_x 
##                    0                    0                    0 
##        magnet_belt_y        magnet_belt_z 
##                    0                    0
armmissing <- sapply(select(dFtraining,names(dFtraining)[grepl("_arm",names(dFtraining))]),
                   function(x) sum(is.na(x)))
armmissing
##            roll_arm           pitch_arm             yaw_arm     total_accel_arm 
##                   0                   0                   0                   0 
##       var_accel_arm        avg_roll_arm     stddev_roll_arm        var_roll_arm 
##               15388               15388               15388               15388 
##       avg_pitch_arm    stddev_pitch_arm       var_pitch_arm         avg_yaw_arm 
##               15388               15388               15388               15388 
##      stddev_yaw_arm         var_yaw_arm         gyros_arm_x         gyros_arm_y 
##               15388               15388                   0                   0 
##         gyros_arm_z         accel_arm_x         accel_arm_y         accel_arm_z 
##                   0                   0                   0                   0 
##        magnet_arm_x        magnet_arm_y        magnet_arm_z   kurtosis_roll_arm 
##                   0                   0                   0               15446 
##  kurtosis_picth_arm    kurtosis_yaw_arm   skewness_roll_arm  skewness_pitch_arm 
##               15448               15398               15445               15448 
##    skewness_yaw_arm        max_roll_arm       max_picth_arm         max_yaw_arm 
##               15398               15388               15388               15388 
##        min_roll_arm       min_pitch_arm         min_yaw_arm  amplitude_roll_arm 
##               15388               15388               15388               15388 
## amplitude_pitch_arm   amplitude_yaw_arm 
##               15388               15388
for_ear_miss <- sapply(select(dFtraining,
                              names(dFtraining)[grepl("_forearm",names(dFtraining))]),
                       function(x) sum(is.na(x)))
for_ear_miss
##            roll_forearm           pitch_forearm             yaw_forearm 
##                       0                       0                       0 
##   kurtosis_roll_forearm  kurtosis_picth_forearm    kurtosis_yaw_forearm 
##                   15448                   15449                   15699 
##   skewness_roll_forearm  skewness_pitch_forearm    skewness_yaw_forearm 
##                   15447                   15449                   15699 
##        max_roll_forearm       max_picth_forearm         max_yaw_forearm 
##                   15388                   15388                   15448 
##        min_roll_forearm       min_pitch_forearm         min_yaw_forearm 
##                   15388                   15388                   15448 
##  amplitude_roll_forearm amplitude_pitch_forearm   amplitude_yaw_forearm 
##                   15388                   15388                   15448 
##     total_accel_forearm       var_accel_forearm        avg_roll_forearm 
##                       0                   15388                   15388 
##     stddev_roll_forearm        var_roll_forearm       avg_pitch_forearm 
##                   15388                   15388                   15388 
##    stddev_pitch_forearm       var_pitch_forearm         avg_yaw_forearm 
##                   15388                   15388                   15388 
##      stddev_yaw_forearm         var_yaw_forearm         gyros_forearm_x 
##                   15388                   15388                       0 
##         gyros_forearm_y         gyros_forearm_z         accel_forearm_x 
##                       0                       0                       0 
##         accel_forearm_y         accel_forearm_z        magnet_forearm_x 
##                       0                       0                       0 
##        magnet_forearm_y        magnet_forearm_z 
##                       0                       0
Dumb_Bell_Missing <- sapply(select(dFtraining,
                               names(dFtraining)[grepl("_dumbbell",names(dFtraining))]),
                        function(x) sum(is.na(x)))
Dumb_Bell_Missing
##            roll_dumbbell           pitch_dumbbell             yaw_dumbbell 
##                        0                        0                        0 
##   kurtosis_roll_dumbbell  kurtosis_picth_dumbbell    kurtosis_yaw_dumbbell 
##                    15392                    15390                    15699 
##   skewness_roll_dumbbell  skewness_pitch_dumbbell    skewness_yaw_dumbbell 
##                    15391                    15389                    15699 
##        max_roll_dumbbell       max_picth_dumbbell         max_yaw_dumbbell 
##                    15388                    15388                    15392 
##        min_roll_dumbbell       min_pitch_dumbbell         min_yaw_dumbbell 
##                    15388                    15388                    15392 
##  amplitude_roll_dumbbell amplitude_pitch_dumbbell   amplitude_yaw_dumbbell 
##                    15388                    15388                    15392 
##     total_accel_dumbbell       var_accel_dumbbell        avg_roll_dumbbell 
##                        0                    15388                    15388 
##     stddev_roll_dumbbell        var_roll_dumbbell       avg_pitch_dumbbell 
##                    15388                    15388                    15388 
##    stddev_pitch_dumbbell       var_pitch_dumbbell         avg_yaw_dumbbell 
##                    15388                    15388                    15388 
##      stddev_yaw_dumbbell         var_yaw_dumbbell         gyros_dumbbell_x 
##                    15388                    15388                        0 
##         gyros_dumbbell_y         gyros_dumbbell_z         accel_dumbbell_x 
##                        0                        0                        0 
##         accel_dumbbell_y         accel_dumbbell_z        magnet_dumbbell_x 
##                        0                        0                        0 
##        magnet_dumbbell_y        magnet_dumbbell_z 
##                        0                        0
Col_2Dr <- c(names(beltmissing[beltmissing != 0]), 
                  names(armmissing[armmissing != 0]),
                  names(for_ear_miss[for_ear_miss != 0]),
                  names(Dumb_Bell_Missing[Dumb_Bell_Missing != 0]))
length(Col_2Dr)
## [1] 100
dF_anly <- tbl_df(dFtraining %>% 
                      select(-Col_2Dr,
                             -c(X,user_name, raw_timestamp_part_1, 
                                raw_timestamp_part_2, cvtd_timestamp, 
                                new_window,num_window)))
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(Col_2Dr)` instead of `Col_2Dr` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
dF_anly$classe <- as.factor(dF_anly$classe)
dF_anly[,1:52] <- lapply(dF_anly[,1:52],as.numeric)
dim(dF_anly)
## [1] 15699    53
corres_Column <- cor(select(dF_anly, -classe))
diag(corres_Column) <- 0
corres_Column <- which(abs(corres_Column)>0.8,arr.ind = T)
corres_Column <- unique(row.names(corres_Column))
corrplot(cor(select(dF_anly,corres_Column)),
         type="upper", order="hclust",method = "number")
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(corres_Column)` instead of `corres_Column` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

#correlationfunnel website: https://business-science.github.io/correlationfunnel/
corr_funl_df <- dF_anly %>% binarize(n_bins = 4, thresh_infreq = 0.01)
Corres_a <- corr_funl_df %>% correlate(target = classe__A) 
Corres_a %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
Corres_b <- corr_funl_df %>% correlate(target = classe__B)
Corres_b %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
Corres_c <- corr_funl_df %>% correlate(target = classe__C)
Corres_c %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
Corres_d <- corr_funl_df %>% correlate(target = classe__D)
Corres_d %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
CorreS_e <- corr_funl_df %>% correlate(target = classe__E)
CorreS_e %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
#subseting dF_anly
Colum_a <- c("magnet_arm_x", "pitch_forearm" , "magnet_dumbbell_y", 
           "roll_forearm", "gyros_dumbbell_y") 
Colum_b <- c("magnet_dumbbell_y", "magnet_dumbbell_x" , "roll_dumbbell" , 
           "magnet_belt_y" , "accel_dumbbell_x" )
Colum_c <- c("magnet_dumbbell_y", "roll_dumbbell" , "accel_dumbbell_y" , 
           "magnet_dumbbell_x", "magnet_dumbbell_z")
Colum_d <- c("pitch_forearm" , "magnet_arm_y" , "magnet_forearm_x",
           "accel_dumbbell_y", "accel_forearm_x")
Colum_e <- c("magnet_belt_y" , "magnet_belt_z" , "roll_belt", 
           "gyros_belt_z" , "magnet_dumbbell_y")
FinaL_Colum <- character()
for(c in c(Colum_a,Colum_b,Colum_c,Colum_d,Colum_e)){
  FinaL_Colum <- union(FinaL_Colum, c)
}
dF_AnLy_2 <- dF_anly %>% select(FinaL_Colum, classe)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(FinaL_Colum)` instead of `FinaL_Colum` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
data.frame("arm" = sum(grepl("_arm",FinaL_Colum)), 
           "forearm" = sum(grepl("_forearm",FinaL_Colum)),
           "belt" = sum(grepl("_belt",FinaL_Colum)),
           "dumbbell" = sum(grepl("_dumbbell",FinaL_Colum)))
##   arm forearm belt dumbbell
## 1   2       4    4        7
MyDenS <- function(data, mapping, ...) {
  ggplot(data = data, mapping=mapping) +
    geom_density(..., alpha = 0.3)+scale_fill_brewer(palette="Set2") 
}
MyPoinT <- function(data, mapping, ...) {
  ggplot(data = data, mapping=mapping) +
    geom_point(..., alpha = 0.1)+ scale_fill_brewer(palette="Set2") 
}
ggpairs(dF_AnLy_2, columns = 1:5,aes(color = classe),
        lower = list(continuous = MyPoinT),diag = list(continuous = MyDenS))

ggpairs(dF_AnLy_2, columns = 6:10,aes(color = classe),
        lower = list(continuous = MyPoinT),diag = list(continuous = MyDenS))

ggpairs(dF_AnLy_2, columns = 11:17,aes(color = classe),
        lower = list(continuous = MyPoinT),diag = list(continuous = MyDenS))

dF_TrainF <- dFtraining %>% select(FinaL_Colum,classe)
dF_ValuF <- dFvalue %>% select(FinaL_Colum,classe)
dF_TrainF[,1:17] <- sapply(dF_TrainF[,1:17],as.numeric)
dF_ValuF[,1:17] <- sapply(dF_ValuF[,1:17],as.numeric)
levels <- c("A", "B", "C", "D", "E")
Prep_RopObj <- preProcess(dF_TrainF[,-18],method = c("center","scale","BoxCox"))
X_Tain <- predict(Prep_RopObj,select(dF_TrainF,-classe))
Y_Tain <- factor(dF_TrainF$classe,levels=levels)
X_vaL <- predict(Prep_RopObj,select(dF_ValuF,-classe))
Y_vaL <- factor(dF_ValuF$classe,levels=levels)
trControl <- trainControl(method="cv", number=5)
CT_Modl <- train(x = X_Tain,y = Y_Tain, 
                 method = "rpart", trControl = trControl)

RF_Modl<- train(x = X_Tain,y = Y_Tain, 
                 method = "rf", trControl = trControl,verbose=FALSE, metric = "Accuracy")


GBM_Modl <- train(x = X_Tain,y = Y_Tain, 
                  method = "gbm",trControl=trControl, verbose=FALSE)

SVM_Modl <- svm(x = X_Tain,y = Y_Tain,
                kernel = "polynomial", cost = 10)
confusionMatrix(predict(CT_Modl,X_vaL),Y_vaL)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1003  330  319  294  106
##          B   19  256   20  109  103
##          C   93  173  345  240  212
##          D    0    0    0    0    0
##          E    1    0    0    0  300
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4853          
##                  95% CI : (0.4696, 0.5011)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3271          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.8987  0.33729  0.50439   0.0000  0.41609
## Specificity            0.6263  0.92067  0.77833   1.0000  0.99969
## Pos Pred Value         0.4888  0.50493  0.32455      NaN  0.99668
## Neg Pred Value         0.9396  0.85275  0.88147   0.8361  0.88377
## Prevalence             0.2845  0.19347  0.17436   0.1639  0.18379
## Detection Rate         0.2557  0.06526  0.08794   0.0000  0.07647
## Detection Prevalence   0.5231  0.12924  0.27097   0.0000  0.07673
## Balanced Accuracy      0.7625  0.62898  0.64136   0.5000  0.70789
confusionMatrix(predict(RF_Modl,X_vaL),Y_vaL)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1112    7    0    0    0
##          B    3  741    5    3    1
##          C    1    7  676   15    4
##          D    0    4    3  625    1
##          E    0    0    0    0  715
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9862          
##                  95% CI : (0.9821, 0.9896)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9826          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9964   0.9763   0.9883   0.9720   0.9917
## Specificity            0.9975   0.9962   0.9917   0.9976   1.0000
## Pos Pred Value         0.9937   0.9841   0.9616   0.9874   1.0000
## Neg Pred Value         0.9986   0.9943   0.9975   0.9945   0.9981
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2835   0.1889   0.1723   0.1593   0.1823
## Detection Prevalence   0.2852   0.1919   0.1792   0.1614   0.1823
## Balanced Accuracy      0.9970   0.9862   0.9900   0.9848   0.9958
plot(RF_Modl$finalModel,main="Error VS no of tree")

confusionMatrix(predict(GBM_Modl,X_vaL),Y_vaL)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1083   40    2    5    3
##          B   18  642   32   16   11
##          C    8   54  635   39   10
##          D    4   21   14  582    9
##          E    3    2    1    1  688
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9253          
##                  95% CI : (0.9166, 0.9333)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9055          
##                                           
##  Mcnemar's Test P-Value : 2.509e-07       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9704   0.8458   0.9284   0.9051   0.9542
## Specificity            0.9822   0.9757   0.9657   0.9854   0.9978
## Pos Pred Value         0.9559   0.8929   0.8512   0.9238   0.9899
## Neg Pred Value         0.9882   0.9635   0.9846   0.9815   0.9898
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2761   0.1637   0.1619   0.1484   0.1754
## Detection Prevalence   0.2888   0.1833   0.1902   0.1606   0.1772
## Balanced Accuracy      0.9763   0.9108   0.9470   0.9452   0.9760
confusionMatrix(predict(SVM_Modl,X_vaL),Y_vaL)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1096   40   18   17    2
##          B    1  676   15    5    6
##          C    9   40  640   45    3
##          D   10    3    9  575    9
##          E    0    0    2    1  701
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9401          
##                  95% CI : (0.9322, 0.9473)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9241          
##                                           
##  Mcnemar's Test P-Value : 1.808e-15       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9821   0.8906   0.9357   0.8942   0.9723
## Specificity            0.9726   0.9915   0.9701   0.9905   0.9991
## Pos Pred Value         0.9344   0.9616   0.8684   0.9488   0.9957
## Neg Pred Value         0.9927   0.9742   0.9862   0.9795   0.9938
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2794   0.1723   0.1631   0.1466   0.1787
## Detection Prevalence   0.2990   0.1792   0.1879   0.1545   0.1795
## Balanced Accuracy      0.9773   0.9411   0.9529   0.9424   0.9857
Test2_dF <- dFtesting %>% select(FinaL_Colum,problem_id)
Test_x <- Test2_dF %>% select(FinaL_Colum)
  
Res_ult <- data.frame("problem_id" = dFtesting$problem_id,
                     "PREDICTION_RF" = predict(RF_Modl,Test_x),
                     "PREDICTION_GBM" = predict(GBM_Modl,Test_x),
                     "PREDICTION_SVM" = predict(SVM_Modl,Test_x))
Res_ult
##    problem_id PREDICTION_RF PREDICTION_GBM PREDICTION_SVM
## 1           1             E              E              C
## 2           2             A              E              A
## 3           3             A              D              B
## 4           4             E              E              C
## 5           5             E              E              A
## 6           6             E              D              C
## 7           7             E              E              B
## 8           8             B              D              A
## 9           9             A              B              E
## 10         10             E              E              E
## 11         11             A              E              C
## 12         12             A              D              C
## 13         13             E              B              E
## 14         14             A              D              B
## 15         15             E              E              B
## 16         16             E              E              A
## 17         17             E              E              C
## 18         18             B              E              A
## 19         19             E              E              A
## 20         20             E              E              E
dF_TrainF2 <- tbl_df(dFtraining %>% 
                      select(-Col_2Dr,
                             -c(X,user_name, raw_timestamp_part_1, 
                                raw_timestamp_part_2, cvtd_timestamp, 
                                new_window,num_window)))
X_Tain2 <- dF_TrainF2 %>% select(-classe)
X_Tain2 <- sapply(X_Tain2,as.numeric)
Y_Tain2 <- factor(dF_TrainF2$classe,levels=levels)  
dF_ValuF2 <- tbl_df(dFvalue %>% 
                      select(-Col_2Dr,
                             -c(X,user_name, raw_timestamp_part_1, 
                                raw_timestamp_part_2, cvtd_timestamp, 
                                new_window,num_window)))
X_vaL2 <- dF_ValuF2 %>% select(-classe)
X_vaL2 <- sapply(X_vaL2,as.numeric)
Y_vaL2 <- factor(dF_ValuF2$classe,levels=levels) 
F2_Test_dF <- tbl_df(dFtesting %>% 
                      select(-Col_2Dr,
                             -c(X,user_name, raw_timestamp_part_1, 
                                raw_timestamp_part_2, cvtd_timestamp, 
                                new_window,num_window)))
X_2tesT <- F2_Test_dF %>% select(-problem_id)
X_2tesT <- sapply(X_2tesT,as.numeric)
ID_pb <- dF_ValuF2$classe
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
Cores_N <- makeCluster(detectCores() - 1)
registerDoParallel(cores=Cores_N)
getDoParWorkers() 
## [1] 3
RF2_Modl <- train(x = X_Tain2,y = Y_Tain2, method = "rf", 
                 metric = "Accuracy", 
                 trControl=trainControl(method = "cv", number = 4, 
                                        p= 0.60, allowParallel = TRUE ))
Res_ult2 <- data.frame("problem_id" = dFtesting$problem_id,
                     "PREDICTION_RF" = predict(RF_Modl,Test_x),
                     "PREDICTION_GBM" = predict(GBM_Modl,Test_x),
                     "PREDICTION_SVM" = predict(SVM_Modl,Test_x),
                     "PREDICTION_RF2_ALL_COL"=predict(RF2_Modl,X_2tesT))
Res_ult2
##    problem_id PREDICTION_RF PREDICTION_GBM PREDICTION_SVM
## 1           1             E              E              C
## 2           2             A              E              A
## 3           3             A              D              B
## 4           4             E              E              C
## 5           5             E              E              A
## 6           6             E              D              C
## 7           7             E              E              B
## 8           8             B              D              A
## 9           9             A              B              E
## 10         10             E              E              E
## 11         11             A              E              C
## 12         12             A              D              C
## 13         13             E              B              E
## 14         14             A              D              B
## 15         15             E              E              B
## 16         16             E              E              A
## 17         17             E              E              C
## 18         18             B              E              A
## 19         19             E              E              A
## 20         20             E              E              E
##    PREDICTION_RF2_ALL_COL
## 1                       B
## 2                       A
## 3                       B
## 4                       A
## 5                       A
## 6                       E
## 7                       D
## 8                       B
## 9                       A
## 10                      A
## 11                      B
## 12                      C
## 13                      B
## 14                      A
## 15                      E
## 16                      E
## 17                      A
## 18                      B
## 19                      B
## 20                      B